# download_mjee_issue_live.py
# MJEE (Majlesi Journal of Electrical Engineering) Downloader
# -------------------------------------------------
# Automates downloading PDFs from MJEE issue pages (mjee.isfahan.iau.ir).
# - Accepts a live issue URL as input.
# - Parses Volume, Issue, and Year from the "Volume & Issue" header.
# - Retrieves article titles from <article class='article-summary'> blocks.
# - Matches each title with its corresponding PDF link in the same block.
# - Resolves relative PDF URLs using urljoin to create full download links.
# - Creates output folder in the format: MJEE_{vol}_{iss}_{year}.
# - Saves PDFs using sanitized titles (Windows-safe filenames).
# - Logs all downloads (title, article URL, PDF URL, filename, status) to a CSV file.
# - Compatible with all MJEE issues without requiring code changes.

"""
Downloader for MJEE (https://mjee.isfahan.iau.ir) — LIVE issue URL

• Input: Issue URL (e.g., https://mjee.isfahan.iau.ir/issue_1137578_1137579.html)
• Output folder: MJEE_{vol}_{iss}_{year}, created next to this script (not in the current working directory)
• Filenames: <Article Title>.pdf
• CSV log inside the output folder

Usage
-----
python download_mjee_issue_live.py "https://mjee.isfahan.iau.ir/issue_1137578_1137579.html"
# Or run without args and paste the URL when prompted.

Args
----
--dry-run    : Parse and list items without downloading
--max N      : Download at most N PDFs
--delay S    : Sleep S seconds between downloads (politeness)
"""


import re
import csv
import sys
import time
import argparse
from pathlib import Path
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup


# Directory containing this script; the per-issue output folder is created here.
SCRIPT_DIR = Path(__file__).resolve().parent

# Retry policy shared by the page fetcher and the PDF downloader.
MAX_RETRIES = 3  # attempts per URL
RETRY_SLEEP = 2  # seconds slept after a failed attempt
TIMEOUT = 60  # passed as ``timeout=`` to every requests.get call

# Browser-like User-Agent sent with every request.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0.0.0 Safari/537.36"
    ),
}


def sanitize_filename(name: str) -> str:
    """Return *name* reduced to a string usable as a Windows file-name stem.

    Steps, in order:
      1. Drop the characters Windows forbids in file names
         (``\\ / * ? : " < > |``) and non-printing control characters.
      2. Collapse every run of whitespace to a single space.
      3. Collapse runs of dots and trim leading/trailing dots and spaces.
      4. Truncate to 180 characters and trim again, so the cut cannot leave
         a dangling space or dot at the end.

    Returns:
        The cleaned name, or ``"untitled"`` when nothing usable is left
        (e.g. an empty title or one made only of forbidden characters), so
        callers never end up with a bare ``.pdf`` file name.
    """
    name = re.sub(r'[\\/*?:"<>|\x00-\x08\x0e-\x1b\x7f]', "", name)
    name = re.sub(r"\s+", " ", name).strip()
    name = re.sub(r"\.+", ".", name).strip(". ")
    name = name[:180].rstrip(". ")
    return name or "untitled"


def get_soup(url: str) -> BeautifulSoup:
    """Fetch *url* and return it parsed with BeautifulSoup's ``html.parser``.

    The GET request (sent with ``HEADERS`` and ``TIMEOUT``) is attempted up
    to ``MAX_RETRIES`` times, sleeping ``RETRY_SLEEP`` seconds between
    attempts. No sleep follows the final attempt, so a permanent failure is
    reported without an extra pause.

    Raises:
        requests.RequestException: the error from the last failed attempt
            (connection problem, timeout, or HTTP error status).
        RuntimeError: if ``MAX_RETRIES`` is below 1, so no attempt was made.
    """
    last_exc = None
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            r = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
            r.raise_for_status()
        except requests.RequestException as exc:
            last_exc = exc
            if attempt < MAX_RETRIES:
                time.sleep(RETRY_SLEEP)
            continue
        # Parsing happens outside the try: a parser error is not a network
        # problem and retrying the download would not help.
        return BeautifulSoup(r.text, "html.parser")

    if last_exc is None:
        raise RuntimeError("MAX_RETRIES must be at least 1")
    raise last_exc


def detect_base(issue_url: str, soup: BeautifulSoup) -> str:
    """Return the absolute base URL used to resolve links found on the page.

    If the document has a ``<base>`` tag with a non-blank ``href``, that
    value is used; it is resolved against *issue_url* first, so a relative
    base such as ``/`` or ``//host/`` still yields an absolute URL.
    Otherwise the site root of *issue_url* (``scheme://host/``) is returned.
    """
    base = soup.find("base")
    href = (base.get("href") or "").strip() if base is not None else ""
    if href:
        return urljoin(issue_url, href)
    p = urlparse(issue_url)
    return f"{p.scheme}://{p.netloc}/"


def parse_issue_meta(soup: BeautifulSoup):
    """Extract (volume, issue, year) as strings from the issue page.

    Expected markup:
    <h4>Volume & Issue:  <span>Volume 14, Issue 1, March 2020</span>

    Candidate texts are tried from most to least specific: for each of
    ``h4``, ``h3``, ``span`` and ``div`` (in that order) the first tag whose
    text contains "Volume & Issue" contributes its first ``<span>`` text and
    then its own text; the full page text is the last candidate. Headings
    are searched before ``div`` because a wrapping ``<div>`` also contains
    the label and precedes the heading in document order.

    Returns:
        A ``(vol, iss, year)`` tuple of strings. Parts that cannot be found
        are replaced by the placeholders ``"Vol"``, ``"Issue"``, ``"Year"``.
    """
    def text_of(node) -> str:
        # Normalize non-breaking spaces so the label check and regexes agree.
        return node.get_text(" ", strip=True).replace("\xa0", " ")

    candidates = []
    for selector in ("h4", "h3", "span", "div"):
        for tag in soup.select(selector):
            tag_text = text_of(tag)
            if "Volume & Issue" not in tag_text:
                continue
            span = tag.find("span")
            if span is not None:
                candidates.append(text_of(span))
            candidates.append(tag_text)
            break
    candidates.append(text_of(soup))
    candidates = [c for c in candidates if c] or [""]

    # Primary pattern: "Volume 14, Issue 1, March 2020"
    primary = re.compile(
        r"Volume\s+(\d+)\s*,\s*Issue\s+(\d+).*?((?:19|20)\d{2})", re.IGNORECASE
    )
    for text in candidates:
        m = primary.search(text)
        if m:
            return m.group(1), m.group(2), m.group(3)

    # Fallbacks: pick each part independently from the most specific
    # candidate that mentions a volume or issue number at all.
    vol_re = re.compile(r"Volume\s+(\d+)", re.IGNORECASE)
    iss_re = re.compile(r"Issue\s+(\d+)", re.IGNORECASE)
    text = next(
        (c for c in candidates if vol_re.search(c) or iss_re.search(c)),
        candidates[0],
    )
    vol = vol_re.search(text)
    iss = iss_re.search(text)
    # Word boundaries keep a longer digit run from being read as a year.
    year = re.search(r"\b(?:19|20)\d{2}\b", text)
    return (
        vol.group(1) if vol else "Vol",
        iss.group(1) if iss else "Issue",
        year.group(0) if year else "Year",
    )


def collect_articles(soup: BeautifulSoup):
    """Gather the downloadable articles listed on an issue page.

    Each ``<article class='article-summary'>`` inside ``div.articleList`` is
    inspected for a title link (``h5 a``) and a PDF link under
    ``ul.actions``. Blocks missing either link are ignored.

    Returns:
        A list of ``(title_text, article_href, pdf_href)`` tuples in page
        order, keeping only the first entry for each
        ``(article_href, pdf_href)`` pair. The hrefs are returned as found
        on the page (whitespace-stripped, not resolved).
    """
    seen_pairs = set()
    articles = []

    for block in soup.select("div.articleList article.article-summary"):
        title_link = block.select_one("h5 a[href]")
        if not title_link:
            continue
        heading = title_link.get_text(" ", strip=True)
        article_href = title_link.get("href", "").strip()

        # Prefer the explicitly classed PDF link; otherwise accept any
        # action link whose href ends in ".pdf".
        pdf_link = (
            block.select_one("ul.actions a.pdf_link[href]")
            or block.select_one("ul.actions a[href$='.pdf']")
        )
        if not pdf_link:
            continue
        pdf_href = pdf_link.get("href", "").strip()

        # De-duplicate on the (article href, pdf href) pair.
        pair = (article_href, pdf_href)
        if pair not in seen_pairs:
            seen_pairs.add(pair)
            articles.append((heading, article_href, pdf_href))

    return articles


def ensure_pdf_response(resp: requests.Response) -> bool:
    """Report whether *resp* looks like a PDF document.

    The response counts as a PDF when its ``Content-Type`` header mentions
    "pdf" (case-insensitively); failing that, when the body starts with the
    ``%PDF-`` signature. The body is only inspected if the header check
    does not already succeed.
    """
    content_type = resp.headers.get("Content-Type") or ""
    return "pdf" in content_type.lower() or resp.content[:5] == b"%PDF-"


def download_file(url: str, dest: Path, delay: float = 0.0):
    """Download the PDF at *url* and store it at *dest*.

    The request is attempted up to ``MAX_RETRIES`` times, sleeping
    ``RETRY_SLEEP`` seconds between attempts (not after the last one).
    A response is accepted only if ``ensure_pdf_response`` recognises it as
    a PDF. The body is written to ``<dest>.part`` and then renamed onto
    *dest*, so an interrupted write never leaves a partial file under the
    final name, where a later run would mistake it for a finished download.

    Args:
        url: Absolute URL of the PDF.
        dest: Target path; missing parent directories are created.
        delay: Seconds to sleep after a successful download (politeness).

    Raises:
        requests.RequestException: network or HTTP error on the last attempt.
        ValueError: the last response was not a PDF.
        OSError: the file could not be written on the last attempt.
        RuntimeError: if ``MAX_RETRIES`` is below 1, so no attempt was made.
    """
    last_exc = None
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            r = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
            r.raise_for_status()
            if not ensure_pdf_response(r):
                raise ValueError(f"Non-PDF response (Content-Type={r.headers.get('Content-Type')})")
            dest.parent.mkdir(parents=True, exist_ok=True)
            part = dest.with_name(dest.name + ".part")
            try:
                part.write_bytes(r.content)
                part.replace(dest)
            finally:
                # After a successful rename the temp file is gone; this only
                # cleans up when the write or rename failed part-way.
                if part.exists():
                    part.unlink()
            if delay > 0:
                time.sleep(delay)
            return
        except (requests.RequestException, ValueError, OSError) as exc:
            last_exc = exc
            if attempt < MAX_RETRIES:
                time.sleep(RETRY_SLEEP)

    if last_exc is None:
        raise RuntimeError("MAX_RETRIES must be at least 1")
    raise last_exc


def main():
    """Command-line entry point: download every PDF of one MJEE issue.

    Reads the issue URL from the command line (or prompts for it), fetches
    and parses the issue page, then either lists the articles (``--dry-run``)
    or downloads each PDF into ``SCRIPT_DIR/MJEE_{vol}_{iss}_{year}`` and
    records the outcome of every item in a CSV log in that folder.

    Exits with status 1 when no URL is given or the issue page cannot be
    fetched. Failures of individual downloads are logged and do not stop
    the run.
    """
    parser = argparse.ArgumentParser(description="Download MJEE issue PDFs by live URL")
    parser.add_argument("issue_url", nargs="?", help="Issue URL, e.g., https://mjee.isfahan.iau.ir/issue_1137578_1137579.html")
    parser.add_argument("--dry-run", action="store_true", help="List items without downloading")
    parser.add_argument("--max", type=int, default=0, help="Download at most N PDFs")
    parser.add_argument("--delay", type=float, default=0.0, help="Seconds to sleep between downloads")
    args = parser.parse_args()

    issue_url = args.issue_url or input("Paste MJEE issue URL (e.g., https://mjee.isfahan.iau.ir/issue_1137578_1137579.html): ").strip()
    if not issue_url:
        print("ERROR: No URL provided.")
        sys.exit(1)

    print(f"[INFO] Fetching issue page: {issue_url}")
    try:
        soup = get_soup(issue_url)
    except Exception as e:
        # Top-level boundary: report the failure instead of a raw traceback.
        print(f"ERROR: Could not fetch issue page: {e}")
        sys.exit(1)

    base_url = detect_base(issue_url, soup)
    vol, iss, year = parse_issue_meta(soup)

    out_folder = SCRIPT_DIR / f"MJEE_{vol}_{iss}_{year}"
    log_path = out_folder / f"MJEE_{vol}_{iss}_{year}_log.csv"

    articles = collect_articles(soup)
    print(f"[INFO] Found {len(articles)} candidate items with PDFs")

    if args.dry_run:
        for idx, (title, art_rel, pdf_rel) in enumerate(articles, 1):
            art_url = urljoin(base_url, art_rel)
            pdf_url = urljoin(base_url, pdf_rel)
            print(f"[{idx}] {title}\n    Article: {art_url}\n    PDF    : {pdf_url}")
        print("[DRY-RUN] No downloads performed.")
        return

    # Created only now, so that a dry run leaves nothing on disk.
    out_folder.mkdir(parents=True, exist_ok=True)

    saved = 0
    used_stems = set()  # case-folded file stems already assigned in this run
    with open(log_path, "w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["Title", "Article URL", "PDF URL", "Filename", "Status"])

        for idx, (title, art_rel, pdf_rel) in enumerate(articles, 1):
            if args.max and saved >= args.max:
                break

            art_url = urljoin(base_url, art_rel)
            pdf_url = urljoin(base_url, pdf_rel)

            # A title that sanitizes to nothing still needs a usable name.
            base_stem = sanitize_filename(title) or f"article_{idx}"
            # Two articles can share a title (e.g. "Editorial"); without a
            # distinct name the second would be reported as "Exists" and
            # never downloaded. Comparison is case-insensitive because
            # Windows file names are.
            safe_title = base_stem
            suffix = 2
            while safe_title.casefold() in used_stems:
                safe_title = f"{base_stem} ({suffix})"
                suffix += 1
            used_stems.add(safe_title.casefold())
            outfile = out_folder / f"{safe_title}.pdf"

            if outfile.exists() and outfile.stat().st_size > 0:
                writer.writerow([title, art_url, pdf_url, outfile.name, "Exists"])
                print(f"[{idx}] ✅ Exists: {outfile.name}")
                continue

            print(f"[{idx}] Downloading: {safe_title}")
            try:
                download_file(pdf_url, outfile, delay=args.delay)
                writer.writerow([title, art_url, pdf_url, outfile.name, "OK"])
                print(f"    ✅ Saved: {outfile.name}")
                saved += 1
            except Exception as e:
                # One failed article must not abort the rest of the issue.
                writer.writerow([title, art_url, pdf_url, outfile.name, f"Error: {e}"])
                print(f"    ❌ Error: {e}")

    print(f"\nDone! {saved} PDFs saved in {out_folder}")
    print(f"Log: {log_path}")


# Run the downloader only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
